# encoding: utf-8

import json
import pandas as pd

# Dataset split to convert. Set to "dev" to convert ./dev.csv instead.
dtypes = "train"

file_name = f"./{dtypes}.csv"

# Read every column as text so labels stay the strings "0"/"1" instead of
# being parsed as integers.
# NOTE: with pandas' default NA handling, empty cells (and sentinel strings
# such as "NA" or "null") are loaded as missing values; rows whose sentence
# or label is missing are skipped below.
datas = pd.read_csv(file_name, dtype=str)

print(datas.head())
print(datas.columns)
# Sample output: Index(['sentence', 'label', 'dataset'], dtype='object')

results = []
labels = ["0", "1"]

# Iterate the rows directly rather than serialising the whole frame to a JSON
# string and parsing it back, which only duplicated the data in memory.
for data in datas.to_dict(orient="records"):
    # A record looks like:
    # {'sentence': '<review text>', 'label': '1', 'dataset': 'jd'}
    text = data.get("sentence")
    label = data.get("label")
    # Missing cells (or absent columns) do not yield strings; skip those rows.
    if not isinstance(text, str) or not isinstance(label, str):
        continue
    # Keep only rows carrying one of the known class labels.
    if label not in labels:
        continue
    # Drop all whitespace (spaces, tabs, newlines) so that every sample
    # occupies exactly one line of the output file.
    text = "".join(text.split())
    # A sentence made only of whitespace is empty after cleaning; skip it.
    if text:
        results.append(f"{text}__{label}\n")


print("total count =", len(results))

# Each entry already ends with "\n", so the lines can be written in one call.
with open(f"./{dtypes}.txt", "w", encoding="utf-8") as f:
    f.writelines(results)

print(f"data: {dtypes} done")
